#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
# @Description Clean HTML text using regular expressions
# @Time : 2020/1/4 18:46 
# @Author : sky 
# @Site :  
# @File : DealHtml.py 
# @Software: PyCharm
"""

import re


# Strip HTML markup from a string and return the remaining text
# Markup constructs that are deleted outright, applied in this order.
# Comments go first so that tags or '>' characters inside a comment vanish
# together with it; script/style go before the generic tag pattern so their
# bodies (which are code, not text) are removed along with the tags.
_MARKUP_PATTERNS = (
    # HTML comments, including multi-line ones and ones containing '>'.
    re.compile(r'<!--.*?-->', re.S),
    # DOCTYPE declaration, in any letter case.
    re.compile(r'<!DOCTYPE\b[^>]*>', re.I),
    # CDATA sections, optionally preceded by the '//' JavaScript guard.
    re.compile(r'(?://\s*)?<!\[CDATA\[.*?\]\]>', re.S),
    # <script> / <style> elements together with their contents; '.*?' is
    # used so a '<' inside the body (e.g. "a < b") does not stop the match.
    re.compile(r'<\s*script\b[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S),
    re.compile(r'<\s*style\b[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S),
    # Any remaining opening/closing/self-closing tag (this covers <br> too).
    re.compile(r'</?\w+[^>]*>'),
)

# A single http(s) link ending in ".html".  Non-greedy and whitespace-bounded
# so that one match never swallows the text between two separate links.
_LINK_PATTERN = re.compile(r'https?://[^\s<>]+?\.html')


def filter_tags(html_str):
    """Strip HTML markup from ``html_str`` and return the bare text.

    The following are removed, in order: HTML comments, the DOCTYPE
    declaration, CDATA sections, ``<script>`` and ``<style>`` elements
    (including their contents), every remaining tag, and ``http``/``https``
    links that end in ``.html``.  Finally *all* whitespace (spaces, tabs,
    line breaks) is dropped, so the result is one unbroken run of text.

    Args:
        html_str: The HTML document or fragment as a string.

    Returns:
        The cleaned text with no markup and no whitespace.
    """
    res = html_str
    for pattern in _MARKUP_PATTERNS:
        res = pattern.sub('', res)

    # Links are removed while whitespace is still present, so the whitespace
    # can act as a boundary between a link and the text that follows it.
    res = _LINK_PATTERN.sub('', res)

    # str.split() with no argument splits on every kind of whitespace, so
    # joining the pieces with '' removes spaces, tabs, '\r' and '\n' at once.
    return ''.join(res.split())


def read_file(path):
    """Return the full contents of the file at ``path``, decoded as UTF-8."""
    with open(path, encoding='utf-8') as handle:
        return handle.read()


if __name__ == '__main__':
    # Demo run: load the sample HTML file, clean it, and print the result.
    print(filter_tags(read_file('./htmldemo.txt')))
